{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-11-16T07:46:49.678700Z",
     "start_time": "2020-11-16T07:46:49.673336Z"
    },
    "uuid": "c974807d-519b-4ac0-a2c4-b0a5624197a7"
   },
   "outputs": [],
   "source": [
    "import pandas as pd  \n",
    "import numpy as np \n",
    "from NEWSREC import *\n",
    "import os, pickle\n",
    "import lightgbm as lgb\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# Directory layout: raw competition data, cached intermediates, final submission.\n",
    "data_path = '../tcdata/'\n",
    "save_path = '../user_data/'\n",
    "pred_path ='../prediction_result/'\n",
    "\n",
    "# Input file names. For leaderboard A, set test_name='testA_click_log.csv'.\n",
    "train_name='train_click_log.csv'\n",
    "test_name='testB_click_log.csv'\n",
    "article_name='articles.csv'\n",
    "user_info_name='user_info.csv'\n",
    "\n",
    "# Min-max scaling; the result is NaN when max == min (zero range).\n",
    "max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))\n",
    "\n",
    "# Click log for the first recall pass (offline=False), with timestamps rescaled.\n",
    "all_click_df = get_all_click_df(data_path, offline=False,train_name=train_name, test_name=test_name)\n",
    "all_click_df['click_timestamp'] = all_click_df[['click_timestamp']].apply(max_min_scaler)\n",
    "\n",
    "item_info_df = get_item_info_df(data_path)\n",
    "\n",
    "# Reuse the cached item-embedding dict when present; otherwise build it.\n",
    "# NOTE: pickle can run arbitrary code on load -- only open cache files produced by this pipeline.\n",
    "if os.path.exists(save_path + 'item_content_emb.pkl'):\n",
    "    # Context manager closes the file handle (the bare open() call leaked it).\n",
    "    with open(save_path + 'item_content_emb.pkl', 'rb') as cache_file:\n",
    "        item_emb_dict = pickle.load(cache_file)\n",
    "else:\n",
    "    item_emb_dict = get_item_emb_dict(data_path,save_path)\n",
    "\n",
    "item_type_dict, item_words_dict, item_created_time_dict = get_item_info_dict(item_info_df)\n",
    "\n",
    "# ---- Recall pass 1: uses the click log loaded with offline=False ('*_havetest' artifacts) ----\n",
    "# Most caches in this pass are only honoured for the B test set.\n",
    "use_cache = test_name=='testB_click_log.csv'\n",
    "\n",
    "hist_csv = '../user_data/datastore/trn_hist_click_df.csv'\n",
    "last_csv = '../user_data/datastore/trn_last_click_df.csv'\n",
    "if use_cache and os.path.exists(hist_csv) and os.path.exists(last_csv):\n",
    "    trn_hist_click_df=pd.read_csv(hist_csv)\n",
    "    trn_last_click_df=pd.read_csv(last_csv)\n",
    "else:\n",
    "    trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n",
    "    # to_csv does not create missing directories, so make sure the target exists first.\n",
    "    os.makedirs(os.path.dirname(hist_csv), exist_ok=True)\n",
    "    trn_hist_click_df.to_csv(hist_csv,index=False)\n",
    "    trn_last_click_df.to_csv(last_csv,index=False)\n",
    "\n",
    "# Item-item collaborative-filtering similarity.\n",
    "# NOTE: pickle can run arbitrary code on load -- only open cache files produced by this pipeline.\n",
    "i2i_sim_pkl = save_path + 'itemcf_i2i_sim_7_5_6_havetest.pkl'\n",
    "if use_cache and os.path.exists(i2i_sim_pkl):\n",
    "    with open(i2i_sim_pkl, 'rb') as cache_file:\n",
    "        i2i_sim = pickle.load(cache_file)\n",
    "else:\n",
    "    i2i_sim = itemcf_sim(save_path, all_click_df, item_created_time_dict, offline=False)\n",
    "\n",
    "# Embedding-based item similarity (this cache is honoured for any test_name).\n",
    "item_emb_df = pd.read_csv(data_path + 'articles_emb.csv')\n",
    "if os.path.exists(save_path + 'emb_i2i_sim.pkl'):\n",
    "    with open(save_path + 'emb_i2i_sim.pkl', 'rb') as cache_file:\n",
    "        emb_i2i_sim = pickle.load(cache_file)\n",
    "else:\n",
    "    emb_i2i_sim = embdding_sim(all_click_df, item_emb_df, save_path, topk=10)\n",
    "\n",
    "# get_recall's return value is not used; it is called for its side effects\n",
    "# (presumably it writes the recall pickle checked here -- confirm in NEWSREC).\n",
    "recall_pkl = save_path + 'itemcf_recall_dict_756_001_3_250_topk50_num10_havetest.pkl'\n",
    "if not (use_cache and os.path.exists(recall_pkl)):\n",
    "    get_recall(save_path, all_click_df, item_created_time_dict, metric_recall=False)\n",
    "\n",
    "# ---- Recall pass 2: click log reloaded with offline=True ('*_notest' artifacts) ----\n",
    "all_click_df = get_all_click_df(data_path, offline=True, train_name=train_name, test_name=test_name)\n",
    "all_click_df['click_timestamp'] = all_click_df[['click_timestamp']].apply(max_min_scaler)\n",
    "\n",
    "hist_notest_csv = '../user_data/datastore/trn_hist_click_df_notest1-11-1.csv'\n",
    "last_notest_csv = '../user_data/datastore/trn_last_click_df_notest1-11-1.csv'\n",
    "if os.path.exists(hist_notest_csv) and os.path.exists(last_notest_csv):\n",
    "    trn_hist_click_df=pd.read_csv(hist_notest_csv)\n",
    "    trn_last_click_df=pd.read_csv(last_notest_csv)\n",
    "else:\n",
    "    trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n",
    "    # to_csv does not create missing directories, so make sure the target exists first.\n",
    "    os.makedirs(os.path.dirname(hist_notest_csv), exist_ok=True)\n",
    "    trn_hist_click_df.to_csv(hist_notest_csv,index=False)\n",
    "    trn_last_click_df.to_csv(last_notest_csv,index=False)\n",
    "\n",
    "# NOTE: pickle can run arbitrary code on load -- only open cache files produced by this pipeline.\n",
    "i2i_sim_notest_pkl = save_path + 'itemcf_i2i_sim_7_5_6_notest.pkl'\n",
    "if os.path.exists(i2i_sim_notest_pkl):\n",
    "    with open(i2i_sim_notest_pkl, 'rb') as cache_file:\n",
    "        i2i_sim = pickle.load(cache_file)\n",
    "else:\n",
    "    i2i_sim = itemcf_sim(save_path, all_click_df, item_created_time_dict, offline=True)\n",
    "\n",
    "# get_recall's return value is not used; it is called for its side effects\n",
    "# (presumably it writes the recall pickle checked here -- confirm in NEWSREC).\n",
    "if not os.path.exists(save_path + 'itemcf_recall_dict_756_001_3_250_topk50_num10_notest.pkl'):\n",
    "    get_recall(save_path, all_click_df, item_created_time_dict, metric_recall=True)\n",
    "\n",
    "# ---- Build labelled (user, candidate item) frames from the recall results ----\n",
    "click_trn, click_val, click_tst, val_ans = get_trn_val_tst_data(data_path,train_name,test_name,validation_set=False)\n",
    "\n",
    "# Training history / last-click frames are read from the '*_notest' datastore CSVs\n",
    "# rather than being recomputed from click_trn.\n",
    "click_trn_hist=pd.read_csv('../user_data/datastore/trn_hist_click_df_notest1-11-1.csv')\n",
    "click_trn_last=pd.read_csv('../user_data/datastore/trn_last_click_df_notest1-11-1.csv')\n",
    "if click_val is not None:\n",
    "    click_val_hist, click_val_last = click_val, val_ans\n",
    "else:\n",
    "    click_val_hist, click_val_last = None, None\n",
    "click_tst_hist = click_tst\n",
    "\n",
    "# Recall dictionaries from both recall passes.\n",
    "# NOTE: pickle can run arbitrary code on load -- only open cache files produced by this pipeline.\n",
    "with open(save_path + 'itemcf_recall_dict_756_001_3_250_topk50_num10_notest.pkl', 'rb') as cache_file:\n",
    "    recall_list_dict = pickle.load(cache_file)\n",
    "with open(save_path + 'itemcf_recall_dict_756_001_3_250_topk50_num10_havetest.pkl', 'rb') as cache_file:\n",
    "    recall_list_dict2 = pickle.load(cache_file)\n",
    "recall_list_df = recall_dict_2_df(recall_list_dict)\n",
    "recall_list_df2= recall_dict_2_df(recall_list_dict2)\n",
    "\n",
    "trn_user_item_label_df, val_user_item_label_df, tst_user_item_label_df = get_user_recall_item_label_df(\n",
    "    click_trn_hist,\n",
    "    click_val_hist,\n",
    "    click_tst_hist,\n",
    "    click_trn_last,\n",
    "    click_val_last,\n",
    "    recall_list_df,\n",
    "    recall_list_df2,\n",
    "    click_val,\n",
    ")\n",
    "\n",
    "article_info_df = get_article_info_df(data_path,article_name=article_name)\n",
    "# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked frame.\n",
    "all_click = pd.concat([click_trn, click_tst])\n",
    "item_content_emb_dict, item_w2v_emb_dict= get_embedding(save_path, all_click)\n",
    "\n",
    "# Candidate-level features for the training (and optional validation) users.\n",
    "# Skipped entirely when the training feature CSV is already on disk.\n",
    "if not os.path.exists(save_path + 'trn_user_item_feats_df.csv'):\n",
    "    trn_user_item_label_tuples = trn_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n",
    "    trn_user_item_label_tuples_dict = dict(zip(trn_user_item_label_tuples['user_id'], trn_user_item_label_tuples[0]))\n",
    "    val_user_item_label_tuples_dict = None\n",
    "    if val_user_item_label_df is not None:\n",
    "        val_user_item_label_tuples = val_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n",
    "        val_user_item_label_tuples_dict = dict(zip(val_user_item_label_tuples['user_id'], val_user_item_label_tuples[0]))\n",
    "\n",
    "    trn_user_item_feats_df = create_feature(\n",
    "        trn_user_item_label_tuples_dict.keys(), trn_user_item_label_tuples_dict,\n",
    "        click_trn_hist, article_info_df, item_content_emb_dict)\n",
    "    val_user_item_feats_df = None\n",
    "    if val_user_item_label_tuples_dict is not None:\n",
    "        val_user_item_feats_df = create_feature(\n",
    "            val_user_item_label_tuples_dict.keys(), val_user_item_label_tuples_dict,\n",
    "            click_val_hist, article_info_df, item_content_emb_dict)\n",
    "\n",
    "    trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df.csv', index=False)\n",
    "    if val_user_item_feats_df is not None:\n",
    "        val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df.csv', index=False)\n",
    "\n",
    "# Same for the test users; this cache is only honoured for the B test set.\n",
    "tst_feats_cached = test_name=='testB_click_log.csv' and os.path.exists(save_path + 'tst_user_item_feats_df.csv')\n",
    "if not tst_feats_cached:\n",
    "    tst_user_item_label_tuples = tst_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n",
    "    tst_user_item_label_tuples_dict = dict(zip(tst_user_item_label_tuples['user_id'], tst_user_item_label_tuples[0]))\n",
    "    tst_user_item_feats_df = create_feature(\n",
    "        tst_user_item_label_tuples_dict.keys(), tst_user_item_label_tuples_dict,\n",
    "        click_tst_hist, article_info_df, item_content_emb_dict)\n",
    "    tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=False)\n",
    "\n",
    "# ---- Per-user profile features, cached at save_path + user_info_name ----\n",
    "# The existence check now uses user_info_name, the same name the file is written\n",
    "# under (it was a hard-coded 'user_info.csv'). The cache is only honoured for the B test set.\n",
    "user_info_cached = test_name=='testB_click_log.csv' and os.path.exists(save_path + user_info_name)\n",
    "if not user_info_cached:\n",
    "    articles = reduce_mem(pd.read_csv(data_path+article_name))\n",
    "    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames in the same order.\n",
    "    click_frames = [click_trn, click_tst]\n",
    "    if click_val is not None:\n",
    "        click_frames.append(click_val)\n",
    "    all_data = reduce_mem(pd.concat(click_frames))\n",
    "    all_data = all_data.merge(articles, left_on='click_article_id', right_on='article_id')\n",
    "\n",
    "    # Feature helpers come from NEWSREC; statement order is kept as-is in case any of them mutate all_data.\n",
    "    user_act_fea = active_level(all_data, ['user_id', 'click_article_id', 'click_timestamp'])\n",
    "    # article_hot_fea is computed but not merged into user_info below.\n",
    "    article_hot_fea = hot_level(all_data, ['user_id', 'click_article_id', 'click_timestamp'])\n",
    "    device_cols = ['user_id', 'click_environment', 'click_deviceGroup', 'click_os', 'click_country', 'click_region', 'click_referrer_type']\n",
    "    user_device_info = device_fea(all_data, device_cols)\n",
    "    user_time_hob_cols = ['user_id', 'click_timestamp', 'created_at_ts']\n",
    "    user_time_hob_info = user_time_hob_fea(all_data, user_time_hob_cols)\n",
    "    user_category_hob_cols = ['user_id', 'category_id']\n",
    "    user_cat_hob_info = user_cat_hob_fea(all_data, user_category_hob_cols)\n",
    "    # Mean word count of the articles each user clicked.\n",
    "    user_wcou_info = all_data.groupby('user_id')['words_count'].agg('mean').reset_index()\n",
    "    user_wcou_info.rename(columns={'words_count': 'words_hbo'}, inplace=True)\n",
    "\n",
    "    # Inner-join all per-user tables on user_id and cache the result.\n",
    "    user_info = pd.merge(user_act_fea, user_device_info, on='user_id')\n",
    "    user_info = user_info.merge(user_time_hob_info, on='user_id')\n",
    "    user_info = user_info.merge(user_cat_hob_info, on='user_id')\n",
    "    user_info = user_info.merge(user_wcou_info, on='user_id')\n",
    "    user_info.to_csv(save_path + user_info_name, index=False)\n",
    "\n",
    "# ---- Final feature tables: candidate features + user profile + article metadata ----\n",
    "# Rebuilt unless both final CSVs exist and the B test set is in use.\n",
    "final_feats_cached = (test_name=='testB_click_log.csv'\n",
    "                      and os.path.exists(save_path + 'trn_user_item_feats_df_finalversion.csv')\n",
    "                      and os.path.exists(save_path + 'tst_user_item_feats_df_finalversion.csv'))\n",
    "if not final_feats_cached:\n",
    "    user_info = pd.read_csv(save_path + user_info_name)\n",
    "    articles = reduce_mem(pd.read_csv(data_path+article_name))\n",
    "\n",
    "    # Prefer the on-disk candidate features; otherwise the in-memory frames are used.\n",
    "    if os.path.exists(save_path + 'trn_user_item_feats_df.csv'):\n",
    "        trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')\n",
    "    if os.path.exists(save_path + 'tst_user_item_feats_df.csv'):\n",
    "        tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')\n",
    "    if os.path.exists(save_path + 'val_user_item_feats_df.csv'):\n",
    "        val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')\n",
    "    else:\n",
    "        val_user_item_feats_df = None\n",
    "\n",
    "    def _finalize_feats(feats_df):\n",
    "        # Attach the user profile (left join) and article metadata (default inner join),\n",
    "        # derive is_cat_hab, then drop the helper columns cate_list and article_id.\n",
    "        feats_df = feats_df.merge(user_info, on='user_id', how='left')\n",
    "        feats_df = feats_df.merge(articles, left_on='click_article_id', right_on='article_id')\n",
    "        # NOTE(review): user_info is read back from CSV, so cate_list is presumably a\n",
    "        # string here; set() of a string is a set of characters, which would make this\n",
    "        # membership test always false. Confirm the stored format before relying on is_cat_hab.\n",
    "        feats_df['is_cat_hab'] = feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)\n",
    "        del feats_df['cate_list']\n",
    "        del feats_df['article_id']\n",
    "        return feats_df\n",
    "\n",
    "    # One pass per split replaces the repeated merge/derive/drop steps and the\n",
    "    # no-op 'else: val_user_item_feats_df = None' branches.\n",
    "    trn_user_item_feats_df = _finalize_feats(trn_user_item_feats_df)\n",
    "    if val_user_item_feats_df is not None:\n",
    "        val_user_item_feats_df = _finalize_feats(val_user_item_feats_df)\n",
    "    tst_user_item_feats_df = _finalize_feats(tst_user_item_feats_df)\n",
    "\n",
    "    trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df_finalversion.csv', index=False)\n",
    "    if val_user_item_feats_df is not None:\n",
    "        val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df_finalversion.csv', index=False)\n",
    "    tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df_finalversion.csv', index=False)\n",
    "\n",
    "# ---- Load the final feature tables from save_path and prepare model inputs ----\n",
    "have_validation = False\n",
    "\n",
    "trn_user_item_feats_df = (pd.read_csv(save_path + 'trn_user_item_feats_df_finalversion.csv')\n",
    "                          .astype({'click_article_id': int}))\n",
    "val_user_item_feats_df = None\n",
    "if have_validation:\n",
    "    val_user_item_feats_df = (pd.read_csv(save_path + 'val_user_item_feats_df_finalversion.csv')\n",
    "                              .astype({'click_article_id': int}))\n",
    "# The test table carries a label column that is dropped right after loading.\n",
    "tst_user_item_feats_df = (pd.read_csv(save_path + 'tst_user_item_feats_df_finalversion.csv')\n",
    "                          .astype({'click_article_id': int})\n",
    "                          .drop(columns=['label']))\n",
    "\n",
    "# Feature columns fed to both LightGBM models.\n",
    "lgb_cols = [\n",
    "    'sim0', 'time_diff0', 'word_diff0',\n",
    "    'sim_max', 'sim_min', 'sim_sum', 'sim_mean',\n",
    "    'score', 'click_size', 'time_diff_mean', 'active_level',\n",
    "    'click_environment', 'click_deviceGroup', 'click_os',\n",
    "    'click_country', 'click_region', 'click_referrer_type',\n",
    "    'user_time_hob1', 'user_time_hob2', 'words_hbo',\n",
    "    'category_id', 'created_at_ts', 'words_count',\n",
    "]\n",
    "\n",
    "# Working copies, sorted by user so that per-user group sizes line up with the rows.\n",
    "trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy().sort_values(by=['user_id'])\n",
    "g_train = trn_user_item_feats_df_rank_model.groupby('user_id')['label'].count().values\n",
    "if have_validation:\n",
    "    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy().sort_values(by=['user_id'])\n",
    "    g_val = val_user_item_feats_df_rank_model.groupby('user_id')['label'].count().values\n",
    "tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()\n",
    "\n",
    "def get_kfold_users(trn_df, n=5):\n",
    "    # Split the distinct user_id values of trn_df into n interleaved folds:\n",
    "    # fold k holds every n-th user starting at position k (order of first appearance).\n",
    "    # Returns a list of n arrays.\n",
    "    distinct_users = trn_df['user_id'].unique()\n",
    "    folds = []\n",
    "    for offset in range(n):\n",
    "        folds.append(distinct_users[offset::n])\n",
    "    return folds\n",
    "# ---- Model 1: LightGBM ranker, k folds split by user ----\n",
    "k_fold = 5\n",
    "trn_df = trn_user_item_feats_df_rank_model\n",
    "user_set = get_kfold_users(trn_df, n=k_fold)\n",
    "score_list = []\n",
    "score_df = trn_df[['user_id', 'click_article_id','label']]\n",
    "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n",
    "for n_fold, valid_user in enumerate(user_set):\n",
    "    # Despite the *_idx names these are DataFrames, not index arrays.\n",
    "    in_valid_fold = trn_df['user_id'].isin(valid_user)\n",
    "    # Non-inplace sort returns new frames, so nothing below mutates a slice of trn_df.\n",
    "    train_idx = trn_df[~in_valid_fold].sort_values(by=['user_id'])\n",
    "    valid_idx = trn_df[in_valid_fold].sort_values(by=['user_id'])\n",
    "    # Ranker group sizes: rows per user, in the same user_id-sorted order as the rows.\n",
    "    g_train = train_idx.groupby(['user_id'], as_index=False).count()['label'].values\n",
    "    g_val = valid_idx.groupby(['user_id'], as_index=False).count()['label'].values\n",
    "    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n",
    "                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n",
    "                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)\n",
    "    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,\n",
    "                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val],\n",
    "                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )\n",
    "    # Out-of-fold scores, normalised with norm_sim, then ranked within each user.\n",
    "    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)\n",
    "    valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))\n",
    "    # A sort_values call whose result was discarded used to sit here; it is dropped, not\n",
    "    # re-assigned, because the per-user rank below does not depend on row order.\n",
    "    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n",
    "    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n",
    "    if not have_validation:\n",
    "        # num_iteration must be passed by keyword: the second positional parameter of\n",
    "        # predict() is raw_score, not the iteration count.\n",
    "        sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols],\n",
    "                                        num_iteration=lgb_ranker.best_iteration_)\n",
    "score_df_ = pd.concat(score_list, axis=0)\n",
    "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n",
    "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)\n",
    "# Test score = mean over the fold models.\n",
    "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n",
    "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n",
    "# Do NOT re-sort the test frame here (the original sort_values result was discarded):\n",
    "# the stacking step aligns the ranker and classifier CSVs by row position, so both\n",
    "# files must keep the same row order.\n",
    "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n",
    "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)\n",
    "\n",
    "# ---- Model 2: LightGBM binary classifier, same user folds ----\n",
    "# get_kfold_users is already defined earlier in this cell; the identical second\n",
    "# definition that used to sit here was removed.\n",
    "k_fold = 5\n",
    "trn_df = trn_user_item_feats_df_rank_model\n",
    "user_set = get_kfold_users(trn_df, n=k_fold)\n",
    "score_list = []\n",
    "score_df = trn_df[['user_id', 'click_article_id', 'label']]\n",
    "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n",
    "for n_fold, valid_user in enumerate(user_set):\n",
    "    # Despite the *_idx names these are DataFrames, not index arrays.\n",
    "    in_valid_fold = trn_df['user_id'].isin(valid_user)\n",
    "    train_idx = trn_df[~in_valid_fold]\n",
    "    # Explicit copy: prediction columns are added to this frame below.\n",
    "    valid_idx = trn_df[in_valid_fold].copy()\n",
    "    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n",
    "                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n",
    "                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)\n",
    "    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])],\n",
    "                          eval_metric=['auc', ],early_stopping_rounds=50, )\n",
    "    # Out-of-fold probability of the positive class (column 1 of predict_proba).\n",
    "    # NOTE(review): these scores are NOT passed through norm_sim, while the test scores\n",
    "    # below are -- the stacker therefore sees differently scaled pred_score_1 values at\n",
    "    # train and predict time. Confirm which of the two is intended.\n",
    "    valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols],\n",
    "                                                              num_iteration=lgb_Classfication.best_iteration_)[:,1]\n",
    "    # A sort_values call whose result was discarded used to sit here; the per-user rank\n",
    "    # below does not depend on row order, so it is dropped rather than re-assigned.\n",
    "    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n",
    "    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n",
    "    if not have_validation:\n",
    "        sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols],\n",
    "                                                     num_iteration=lgb_Classfication.best_iteration_)[:,1]\n",
    "score_df_ = pd.concat(score_list, axis=0)\n",
    "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n",
    "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)\n",
    "# Test score = mean over the fold models; this overwrites the ranker's pred_score/pred_rank columns.\n",
    "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n",
    "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n",
    "# Do NOT re-sort the test frame here (the original sort_values result was discarded):\n",
    "# the stacking step aligns the ranker and classifier CSVs by row position, so both\n",
    "# files must keep the same row order.\n",
    "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n",
    "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)\n",
    "\n",
    "# ---- Stacking: logistic regression over both models' scores and ranks ----\n",
    "trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')\n",
    "trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')\n",
    "tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')\n",
    "tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')\n",
    "\n",
    "# .copy() makes these independent frames, so the column assignments below do not\n",
    "# write into slices of the frames just read (pandas SettingWithCopyWarning).\n",
    "finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']].copy()\n",
    "finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']].copy()\n",
    "\n",
    "# Columns are suffixed _0 (ranker) and _1 (classifier). Assignment aligns on the default\n",
    "# row index, so for each split both CSVs must list the same rows in the same order.\n",
    "model_outputs = [(trn_lgb_ranker_feats, tst_lgb_ranker_feats),\n",
    "                 (trn_lgb_cls_feats, tst_lgb_cls_feats)]\n",
    "for idx, (trn_model, tst_model) in enumerate(model_outputs):\n",
    "    for feat in [ 'pred_score', 'pred_rank']:\n",
    "        col_name = feat + '_' + str(idx)\n",
    "        finall_trn_ranker_feats[col_name] = trn_model[feat]\n",
    "        finall_tst_ranker_feats[col_name] = tst_model[feat]\n",
    "\n",
    "feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1']\n",
    "trn_x = finall_trn_ranker_feats[feat_cols]\n",
    "trn_y = finall_trn_ranker_feats['label']\n",
    "tst_x = finall_tst_ranker_feats[feat_cols]\n",
    "lr = LogisticRegression()\n",
    "lr.fit(trn_x, trn_y)\n",
    "# Final score = stacker's probability of the positive class.\n",
    "finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]\n",
    "\n",
    "rank_results = finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]\n",
    "submit(pred_path, rank_results, topk=5)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "tianchi_metadata": {
   "competitions": [],
   "datasets": [
    {
     "id": "83580",
     "title": "零基础入门推荐系统 - 新闻推荐"
    }
   ],
   "description": "",
   "notebookId": "143888",
   "source": "dsw"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "170px"
   },
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
